from pyspark import SparkContext
from pyspark.sql import SparkSession, Row
from pyspark.sql.types import StringType, StructField, StructType
import json
import csv
import os

sc = SparkContext('local', 'spark_project')
sc.setLogLevel('WARN')
spark = SparkSession.builder.getOrCreate()

schemaString = "budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count"
fields = [StructField(field, StringType(), True)
          for field in schemaString.split(",")]
schema = StructType(fields)

moviesRdd = sc.textFile('tmdb_5000_movies.csv').map(
    lambda line: Row(*next(csv.reader([line]))))
mdf = spark.createDataFrame(moviesRdd, schema)

mdf.createOrReplaceTempView("movie_data")


mdf.printSchema()
spark.sql("select * from movie_data").show(5,truncate=False)
